library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(gganimate)
library(gifski)
library(png)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
library(geosphere)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
#Weather data already incorporated into SampleCitiBike.csv
rawdata <- read.csv("SampleCitiBike.csv")
sampleData <- sample_frac(rawdata, 0.01)
write.csv(sampleData, "new_MASTER_01_data.csv")
masterdata <- read.csv("new_MASTER_01_data.csv")
masterdata$X <- NULL
masterdata$starttime <- NULL
masterdata$stoptime <- NULL
masterdata$start.station.id <- as.factor(masterdata$start.station.id)
masterdata$start.station.name <- as.factor(masterdata$start.station.name)
masterdata$end.station.id <- as.factor(masterdata$end.station.id)
masterdata$end.station.name <- as.factor(masterdata$end.station.name)
masterdata$bikeid <- as.factor(masterdata$bikeid)
masterdata$usertype <- as.factor(masterdata$usertype)
masterdata <- rename(masterdata, startTime = newStartTime, stopTime = newStopTime)
masterdata$startTime <- as.POSIXct(strptime(masterdata$startTime, "%Y-%m-%d %H:%M:%S"))
masterdata$stopTime <- as.POSIXct(strptime(masterdata$stopTime, "%Y-%m-%d %H:%M:%S"))
masterdata$startDate <- as.Date(masterdata$startTime)
masterdata$stopDate <- as.Date(masterdata$stopTime)
masterdata$distMeters <- distHaversine(cbind(masterdata$start.station.latitude, masterdata$start.station.longitude), cbind(masterdata$end.station.latitude, masterdata$end.station.longitude))
masterdata$ageGroup <- as.factor(ifelse(masterdata$birth.year >= 2000, "GenZ", ifelse(masterdata$birth.year >= 1981, "Millennial", ifelse(masterdata$birth.year >= 1965, "GenX", ifelse(masterdata$birth.year >= 1946, "Boomer", ifelse(masterdata$birth.year >= 1928, "Silent", "VeryOld"))))))
masterdata$ageGroup <- factor(masterdata$ageGroup, levels = c("GenZ", "Millennial", "GenX", "Boomer", "Silent", "VeryOld"))
masterdata$startMonth <- month(masterdata$startDate)
masterdata$stopMonth <- month(masterdata$stopDate)
masterdata$startMonthFactor <- as.factor(month(masterdata$startDate))
masterdata$stopMonthFactor <- as.factor(month(masterdata$stopDate))
masterdata$seasonStart <- as.factor(ifelse(masterdata$startMonth >= 3 & masterdata$startMonth <= 5, "Spring", ifelse(masterdata$startMonth >= 6 & masterdata$startMonth <= 8, "Summer", ifelse(masterdata$startMonth >= 9 & masterdata$startMonth <= 11, "Fall", "Winter"))))
masterdata$seasonStart <- factor(masterdata$startMonth, levels = c("Spring", "Summer", "Fall", "Winter"))
masterdata$numWeekday <- as.factor(wday(masterdata$startDate))
#Defining rush hour as 6-10AM and 4-8PM
masterdata$rushHour <- as.factor(ifelse(masterdata$numWeekday == 1 | masterdata$numWeekday == 7, "No", ifelse(hour(masterdata$startTime) < 6 | hour(masterdata$startTime) > 10 & hour(masterdata$startTime) < 16 | hour(masterdata$startTime) > 20, "No", "Yes")))
masterdata <- rename(masterdata, maxTemp = TMAX, minTemp = TMIN)
masterdata$weekNum <- as.numeric(strftime(masterdata$startDate, format = "%V"))
masterdata$speedMetersperSec <- masterdata$distMeters / masterdata$tripduration
masterdata <- rename(masterdata, avgTemp = TAVG)
masterdata$tempFeel <- as.factor(ifelse(masterdata$maxTemp < 40, "Frigid", ifelse(masterdata$maxTemp < 58, "Cold", ifelse(masterdata$maxTemp < 65, "Cool", ifelse(masterdata$maxTemp < 75, "Warm", ifelse(masterdata$maxTemp < 95, "Hot", "Blazing"))))))
masterdata$tempFeel <- factor(masterdata$tempFeel, levels = c("Frigid", "Cold", "Cool", "Warm", "Hot", "Blazing"))
masterdata$gender <- as.factor(ifelse(masterdata$gender == "0", "Unknown", ifelse(masterdata$gender == "1", "Male", "Female")))
masterdata$timeOfDay <- ifelse(hour(masterdata$startTime) >= 0 & hour(masterdata$startTime) < 12, "morning", ifelse(hour(masterdata$startTime) >= 12 & hour(masterdata$startTime) <=24, "afternoon","night"))
masterdata$roundedSNOW <- floor(masterdata$SNOW)
masterAM <- filter(masterdata, masterdata$timeOfDay == "morning")
masterPM <- filter(masterdata,masterdata$timeOfDay == "afternoon")
weekdayData <- subset(masterAM, subset = (masterAM$numWeekday != "7" & masterAM$numWeekday != "1"))
weekendData <- subset(masterAM, subset = (masterAM$numWeekday == "7" | masterAM$numWeekday == "1"))
#create column for trip duration in minutes
masterdata$tripdurationmin <- masterdata$tripduration / 60
#create Mode function to find the most common levels within variables
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
#apply Mode function to find the most common levels within variables
#Gender mode
Mode(masterdata$gender)
## [1] Male
## Levels: Female Male Unknown
#Start station modes
Mode(masterdata$start.station.name)
## [1] Pershing Square North
## 908 Levels: 1 Ave & E 110 St 1 Ave & E 16 St 1 Ave & E 18 St ... Yankee Ferry Terminal
#End station mode
Mode(masterdata$end.station.name)
## [1] Pershing Square North
## 908 Levels: 1 Ave & E 110 St 1 Ave & E 16 St 1 Ave & E 18 St ... Yankee Ferry Terminal
#BikeID mode
Mode(masterdata$bikeid)
## [1] 35306
## 19094 Levels: 14529 14530 14531 14533 14534 14535 14536 14537 14539 ... 42046
#Birth year mode
Mode(masterdata$birth.year)
## [1] 1969
#User type mode
Mode(masterdata$usertype)
## [1] Subscriber
## Levels: Customer Subscriber
#Most frequent age
2020 - Mode(masterdata$birth.year)
## [1] 51
#Longest bike ride
max(masterdata$tripdurationmin)
## [1] 44633.38
#Mean trip duration by gender
tapply(masterdata$tripdurationmin, masterdata$gender, mean)
## Female Male Unknown
## 16.51171 13.61907 42.76717
#Mean trip duration by usertype
tapply(masterdata$tripdurationmin, masterdata$usertype, mean)
## Customer Subscriber
## 37.92914 13.05964
#How many times did a rider forget to return a bike? I am defining this using more than 15 hours of rental = 900 minutes
nrow(masterdata[masterdata$tripdurationmin > 900 ,])
## [1] 98
Average and Median Trip Duration
# Average
mean(masterdata$tripduration)
## [1] 992.7189
# Median
median(masterdata$tripduration)
## [1] 616
The average trip lasts a little over 16 minutes, but the median trip duration is only around 10 minutes (there are likely long trips that are skewing the data)
Average and Median Rider Birth Year
# Average
mean(masterdata$birth.year)
## [1] 1980.174
# Median
median(masterdata$birth.year)
## [1] 1983
The average rider is ~39 years old
Number of Bikes in The Sample
length(levels(masterdata$bikeid))
## [1] 19094
Percent Customer vs. Subscriber
# Number of Customers in Sample
numCustomers <- nrow(masterdata[masterdata$usertype == "Customer", ])
# Number of Subscribers in Sample
numSubscribers <- nrow(masterdata[masterdata$usertype == "Subscriber", ])
# Total Users in Sample
totalUsers <- numCustomers + numSubscribers
14.0158722% of Users are Customers 85.9841278% of Users are Subscribers
Percent Male/Female
# Number of Males in Sample
numMale <- nrow(masterdata[masterdata$gender == "1", ])
# Number of Females in Sample
numFemale <- nrow(masterdata[masterdata$gender == "2", ])
# Total People in Sample
totalPeople <- numCustomers + numSubscribers
0% of Riders are Male 0% of Riders are Female
Number of Stations in Sample (Start and End)
# Start Stations
length(levels(masterdata$start.station.name))
## [1] 908
# End Stations
length(levels(masterdata$end.station.name))
## [1] 908
Discrepancy in number of start and stop stations likely caused by variance caused by random sample
Average Trip Duration by Start Station
barplot(tapply(masterdata$tripduration, masterdata$start.station.name, mean, na.rm = TRUE))
Some stations have significantly higher trip duration than the expected average of 992.7188894 seconds. This may signal that at some of these stations bikes are being lost or possibly not returned.
Average Trip Duration by Month
barplot(tapply(masterdata$tripduration, masterdata$startTime, mean, na.rm = TRUE))
It seems that regardless of month the average trip durations vary around the same level. The number of trips may change, but trip duration is relatively consistent.
Is there any correlation between a rider’s birth year (age) and the length of their rides? How do usertypes and genders factor into this potential correlation?
ggplot(data=masterdata, aes(x=birth.year,y=tripdurationmin, colour=gender)) + geom_point()
#limiting to below 2500 minutes to make visual more legible
ggplot(data=masterdata[masterdata$tripdurationmin <= 2500,], aes(x=birth.year,y=tripdurationmin, colour=gender)) + geom_point()
ggplot(data=masterdata[masterdata$tripdurationmin <= 2500,], aes(x=birth.year,y=tripdurationmin, colour=usertype)) + geom_point()
The first plot is not very easy to read, as the outliers near the top end of the “tripdurationmin” range are making the graph scale hard to interpret; thus, I limit the range for following visualizations. An interesting insight from the next visualization is that, while most riders are born in recent years, the few older riders tend to use the bikes for far shorter rides on an aggregate level. Gender does not seem to play much role here. In the final plot, it is interesting to note that the older riders that use CitiBike tend to be subscribers more often than customers.
Which gender had the most rides?
ggplot(data=masterdata, aes(x=gender)) + geom_bar()
Males had the most rides.
What is the distribution of trip duration as it relates to gender and usertype?
ggplot(data=masterdata, aes(x=gender,y=tripdurationmin)) + geom_boxplot()
#limiting to below 100 minutes to make visual more legible
ggplot(data=masterdata[masterdata$tripdurationmin <= 100,], aes(x=gender,y=tripdurationmin)) + geom_boxplot()
ggplot(data=masterdata[masterdata$tripdurationmin <= 100,], aes(x=usertype,y=tripdurationmin)) + geom_boxplot()
ggplot(data=masterdata[masterdata$tripdurationmin <= 100,], aes(x=gender,y=tripdurationmin)) + geom_violin()
ggplot(data=masterdata[masterdata$tripdurationmin <= 100,], aes(x=usertype,y=tripdurationmin)) + geom_violin()
This set of visualizations makes it clear that, although there is an extensive range of tripduration lengths, the bulk of the distribution is concentrated within less than 100 minutes. Within genders, the male rides are more tightly distributed around their median than female rides are. Male rides, on average, are slightly shorter. Within usertypes, customers on average have higher average trip durations than subscribers. Many hypotheses can be formed as to why these comparisons are as such, but they can not be tested using this data set. For example, maybe customers tend to be travelers or occasional leisure riders who use the bikes on a one-off basis and choose to take their time, while subscribers may be regularly using these bikes to quickly commute to and from work, thus resulting in shorter ride lengths and a tighter distribution around the mean. However, again, these are merely hypotheses that come to mind when looking at the visualization, but these are not entirely supported by the data.
#Ride Distance by Maximum Temperature by Gender
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = gender)) + geom_point() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "Gender")
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = gender)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#Ride Distance by Maximum Temperature by User Type
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = usertype)) + geom_point() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "User Type")
ggplot(data = masterdata, aes(x = maxTemp, y = distMeters, colour = usertype)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Ride Distance (meters)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#Ride Distance by Minimum Temperature by Gender
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = gender)) + geom_point() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "Gender")
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = gender)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#Ride Distance by Minimum Temperature by User Type
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = usertype)) + geom_point() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "User Type")
ggplot(data = masterdata, aes(x = minTemp, y = distMeters, colour = usertype)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Ride Distance (meters)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
From these line charts, we can see that non-subscribers/customers have longer trip distances than subscribers. A possibility is that most non-subscribers may be tourists who rent CitiBikes to tour the city, leading them to ride longer distances than subscribers, who presumably use CitiBike as a mode of transportation to and from work.
Additionally, we observe that female riders, on average, have longer trip durations than male riders. A possible explanation could be that male riders have a lower threshold for travel distance by bicycle before deciding their destination is far enough to take the subway or an Uber/Lyft instead.
#By Maximum Temperature By Gender
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = gender)) + geom_point() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = gender)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#By Maximum Temperature By User Type
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = usertype)) + geom_point() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = maxTemp, y = tripduration, colour = usertype)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#By Minimum Temperature By Gender
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = gender)) + geom_point() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = gender)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#By Minimum Temperature By User Type
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = usertype)) + geom_point() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
ggplot(data = masterdata[masterdata$tripduration < 10000,], aes(x = minTemp, y = tripduration, colour = usertype)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Trip Duration (seconds)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Comparing the line charts sorted by gender and user type, it seems a large portion of the “Unknown” gender population could be customers (although we can’t be sure). These could be tourists visiting the city taking longer trips.
Additionally, we notice that male riders tend to have shorter trip durations than female riders. Considering the plots in the previous section that conveyed female riders tend to ride longer distances, we can reasonably assume their longer trip durations are due to longer ride distances.
#By Day of Week
masterdata %>%
group_by(startDate, numWeekday) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = numWeekday, y = numRides)) + geom_boxplot() + labs(x = "Weekday (1 = Sunday, 7 = Saturday)", y = "Average Number of CitiBike Rides")
## `summarise()` regrouping output by 'startDate' (override with `.groups` argument)
#By Maximum Temperature
masterdata %>%
group_by(maxTemp) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = maxTemp, y = numRides)) + geom_point() + labs(x = "Maximum Temperature (F)", y = "Average Number of CitiBike Rides")
## `summarise()` ungrouping output (override with `.groups` argument)
#By Minimum Temperature
masterdata %>%
group_by(minTemp) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = minTemp, y = numRides)) + geom_point() + labs(x = "Minimum Temperature (F)", y = "Average Number of CitiBike Rides")
## `summarise()` ungrouping output (override with `.groups` argument)
#Gender
masterdata %>%
group_by(gender, maxTemp) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = maxTemp, y = numRides, colour = gender)) + geom_smooth() + labs(x = "Maximum Temperature (F)", y = "Average Number of CitiBike Rides")
## `summarise()` regrouping output by 'gender' (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#User Type
masterdata %>%
group_by(usertype, maxTemp) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = maxTemp, y = numRides, colour = usertype)) + geom_smooth() + labs(x = "Maximum Temperature (F)", y = "Average Number of CitiBike Rides")
## `summarise()` regrouping output by 'usertype' (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
According to the box plot, we can see that, on average, Wednesday is the busiest day for CitiBike. Additionally, the first and third quartiles are significantly closer together than they are on other days, meaning CitiBike can be fairly certain Wednesday will pretty much always be busy.
Dot plots for both minimum and maximum daily temperatures were created to see if either figure had a greater impact on a rider’s decision to take a CitiBike trip, using these numbers as proxies for “Highs” and “Lows” reported by weather apps. The distributions for both graphs seem quite similar, with the most rides happening on days with a maximum temperature of roughly 75 degrees, and minimum temperature of 65 degrees.
The line charts show us that non-subscribers/customers are less sensitive to temperature fluctuations than subscribers, presumably because non-subscribers are likely tourists who have no choice but to tour the city, since they’re only there for a limited amount of time. Subscribers, who are most likely residents of NYC, are much more sensitive to temperature fluctuations, as expected. Furthermore, if we break it up by gender, we see that female riders tend to be more sensitive to temperature fluctuations than male riders.
#Speed by Maximum Temperature by Gender
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = gender)) + geom_point() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "Gender")
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = gender)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#Speed by Maximum Temperature by User Type
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = usertype)) + geom_point() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "User Type")
ggplot(data = masterdata, aes(x = maxTemp, y = speedMetersperSec, colour = usertype)) + geom_smooth() + labs(x = "Maximum Temperature", y = "Speed (meters/s)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#Speed by Minimum Temperature by Gender
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = gender)) + geom_point() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "Gender")
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = gender)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "Gender")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#Speed by Minimum Temperature by User Type
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = usertype)) + geom_point() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "User Type")
ggplot(data = masterdata, aes(x = minTemp, y = speedMetersperSec, colour = usertype)) + geom_smooth() + labs(x = "Minimum Temperature", y = "Speed (meters/s)", colour = "User Type")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Interestingly enough, our previous hypothesis - riders whose genders are “Unknown” are mostly non-subscribers - is inconsistent with the data presented in the graphs above. As we look at average speed for the “Unknown” gender group, we see that their average speed is significantly slower than that of riders who indicated they’re either male or female. However, the line chart that groups riders by user type shows subscribers actually ride much faster than non-subscribers, which is to be expected, assuming most of the non-subscriber population consists of tourists.
Additionally, male riders tend to ride CitiBikes faster than female riders. A possible behavioral explanation to this pattern could be that, assuming male riders are more sensitive to ride distance in their decision to take either a CitiBike or subway/Uber/Lyft (as hypothesized in the “Ride Duration” section), they may have a tendency to feel more rushed in completing what they deem to be a “short trip,” leading them to ride faster; or, male riders could just be less careful when riding. There are endless possibilities for what could be causing this pattern.
Lastly, the sensitivity of these rider groups to temperature doesn’t seem to differ when it comes to speed.
#Data Setup
startStationData <- masterdata %>%
group_by(tempFeel, start.station.id) %>%
summarise(
numRides = n()
) %>%
arrange(desc(numRides)) %>%
slice(1:5) %>%
ggplot(aes(x = reorder(start.station.id, numRides, na.rm = TRUE), y = numRides)) + geom_col() + transition_states(tempFeel, transition_length = 2, state_length = 1) + enter_fade() + exit_shrink() + ease_aes('sine-in-out') + labs(title = "Weather Feel: {closest_state}") + labs(x = "Start Station ID", y = "Number of Rides")
## `summarise()` regrouping output by 'tempFeel' (override with `.groups` argument)
endStationData <- masterdata %>%
group_by(tempFeel, end.station.id) %>%
summarise(
numRides = mean(n())
) %>%
arrange(desc(numRides)) %>%
slice(1:5) %>%
ggplot(aes(x = reorder(end.station.id, numRides, na.rm = TRUE), y = numRides)) + geom_col() + transition_states(tempFeel, transition_length = 2, state_length = 1) + enter_fade() + exit_shrink() + ease_aes('sine-in-out') + labs(title = "Weather Feel: {closest_state}") + labs(x = "End Station ID", y = "Number of Rides")
## `summarise()` regrouping output by 'tempFeel' (override with `.groups` argument)
#Start Station Chart
animate(startStationData)
#End Station Chart
animate(endStationData)
From these animations, we can see that stations 3255, 497, 519, 426, and 514 tend to be the destination for many people when the temperatures get hot. These could potentially be places where pools or other fun outside activities reside. We also see that on hot days, there are many people who depart from stations 497, 519, 514, 459, and 426. No causal conclusions can be drawn from these visualizations, but they can show us what stations tend to be busier at different temperature levels.
nrow(masterdata[masterdata$PRCP < .5,])
## [1] 187806
nrow(masterdata[masterdata$PRCP >= .5 & masterdata$PRCP < 1,])
## [1] 13168
nrow(masterdata[masterdata$PRCP >= 1 & masterdata$PRCP < 1.5,])
## [1] 2712
nrow(masterdata[masterdata$PRCP >= 1.5,])
## [1] 1831
These numbers will guide the analysis below, as it is important to note that, while the averages on the y-axis may provide suggest certain insights, looking at the confidence intervals at various ranges will be useful in drawing meaningful insights. As these metrics indicate, PRCP certainly has a negative correlation with number of rides that occur, which suggests that bikers in higher PRCP may not be reflective of the typical Citibike biker.
#prcp vs Haversine Distance (distMeters) by gender
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=gender)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=gender)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=gender)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=gender)) + geom_boxplot()
#prcp vs tripduration by usertype
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=usertype)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=usertype)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=usertype)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=distMeters, colour=usertype)) + geom_boxplot()
Amongst unknown genders, PRCP is associated with a decrease in distance. For males and females, there seems to be a decrease in distance as PRCP increases to a certain level, after which the rate of decrease diminishes. For females, the distance begins to increase, whereas for males it mostly plateaus. This, as seen previously, may be reflective of who is biking in these various PRCP ranges. In the middle range, we can infer that people try to minimize distance if they can feasibly. Perhaps as PRCP becomes drastic, only those with a need to bike will be out, who may be not be able to adjust the distance of their trip. The disparity between male response and female response here is curious. Customers, who are likely recreational/infrequent users, predictably decrease distance in correlation to increased PRCP. Subscribers reflect a response similar to the females mentioned previously.
ggplot(data=masterdata, aes(x=PRCP,y=distMeters)) + geom_point() + facet_wrap(~ startMonthFactor)
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters,colour=usertype)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters,colour=gender)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters,colour=gender)) + geom_smooth() + facet_wrap(~ usertype)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=distMeters,colour=usertype)) + geom_smooth() + facet_wrap(~ gender)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
In colder months and August, it seems that PRCP does not have significant correlation to average distance. In other months, the correlation fluctuates or steadily yields lower distance as PRCP increases. Interestingly, April which is a very rainy month traditionally seems to have the greatest fluctuation for distance’s correlation with PRCP. Customers seem reliably unaffected by PRCP values in aggregate, except for a few interesting examples in August and May. Subscribers, again, vary greatly in their response, which may suggest that we must look into the behavioral trends of specific users to gain a full picture. While most insights from this data is fundamentally speculative, it is interesting to note the disparity in how females, males, and unknown genders vary in their response to PRCP, when separated into usertypes. Female customers seem unbothered, while female subscribers decrease distances up until a certain point and then increase again (potentially due to only necessary rides being made, which are not responsive to PRCP changes). Male customers strongly decrease distance as PRCP increases, while male subscribers reflect a similar pattern as female subscribers (potentially due to the aforementioned insight). Similar insights are yielded by separating user types into genders.
ggplot(data=masterdata, aes(x=startDate, y=tripduration, colour=gender)) + geom_point()
#newStartDate vs tripduration by gender
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=gender)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=gender)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=gender)) + geom_violin()
## Warning: position_dodge requires non-overlapping x intervals
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=gender)) + geom_boxplot()
#newStartDate vs tripduration by usertype
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=usertype)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=usertype)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=usertype)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=startDate, y=tripduration, colour=usertype)) + geom_boxplot()
As one might expect, trip duration increases during warmer months and decreases as temperature drops; this suggests that traveling longer distances is either more necessary or enjoyable in warmer months. Females on average have longer trips than men. Unknown gender has the highest trip duration, and customers have higher trip durations than subscribers. Perhaps customers do not have to reveal their gender information, and perhaps these customers differ in ways other than just status as it pertains to their trip duration. Citibike managers should keep in mind that any sort of system overhauls, construction, or repair should be placed in a month with less demand so the company does not miss out on revenue from peak times.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=gender)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=gender)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=gender)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=gender)) + geom_boxplot()
#prcp vs tripduration by usertype
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=usertype)) + geom_point()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=usertype)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=usertype)) + geom_violin()
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP, y=tripduration, colour=usertype)) + geom_boxplot()
As precipitation increases, trip duration decreases. Females again have higher average trip duration, but they seem to have a varied correlation to preciptation. Perhaps the rise/plateau at the high PRCP levels for both males and females is influenced by people who use Citibike out of necessity. This means that the primary decrease in trip duration as PRCP increases is logical, as people who can make their trips shorter will. However, beyond a certain point, the people who cannot adjust their travel will then be bringing up the overall average trip duration. Unknown genders, who may be those who are not regular users of Citibike, are likely casual bikers who will decrease their trip lengths as much as posssible, and this is what the visualization depicts. It is curious that customers have inconsistent correlation to PRCP values. Perhaps we can infer that some rain deters users from taking long trips, while there is a certain amount of rain that is considered pleasant; this certain amount can also be an amount where casual riders do not ride, and so only bikers who bike out of need are biking in the middle range. After this middle range, perhaps even those bikers begin having to compromise on their trip lengths. Biking speed may also fluctuate and be responsible for trip duration changes.
averagePRCPMonthly <- tapply(masterdata$PRCP,masterdata$startMonthFactor,mean,)
plot(averagePRCPMonthly,xlab="Month",ylab="Average PRCP")
averageTripDurationMonthly <- tapply(masterdata$tripduration,masterdata$startMonthFactor,mean,)
plot(averageTripDurationMonthly,xlab="Month",ylab="Average Trip Duration")
numTripsMonthly <- table(masterdata$startMonth)
plot(x=averagePRCPMonthly, y=averageTripDurationMonthly)
plot(x=averagePRCPMonthly, y=numTripsMonthly)
ggplot(data=masterdata, aes(x=PRCP,y=tripduration)) + geom_point() + facet_wrap(~ startMonthFactor)
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration,colour=usertype)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration,colour=gender)) + geom_smooth() + facet_wrap(~ startMonthFactor)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration,colour=gender)) + geom_smooth() + facet_wrap(~ usertype)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=PRCP,y=tripduration,colour=usertype)) + geom_smooth() + facet_wrap(~ gender)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
It does not appear that months with higher average PRCP correspond to lower average trip durations. This may be due to the rainier months also being warmer and more pleasant than harsh winters of NY. Perhaps the pleasant days in rainy months are very positive for bikers in general, to the extent that they compensate for rainy days. We can see that, in different months, the amount of PRCP has varied correlations with trip duration. The winter months have little to know average tripduration changes as PRCP increases, which may reflect that bikers who ride during these times are not responsive to PRCP. Customers primarily decrease trip duration as PRCP increases, except in December and June, which may be months where tourists are determined to bike no matter the PRCP; subscribers vary greatly in their responses to PRCP in each month. Similar insights can be drawn when arranging the data by gender and usertype.
ggplot(data=masterdata, aes(x=PRCP,y=speedMetersperSec)) + geom_point()
ggplot(data=masterdata, aes(x=PRCP,y=speedMetersperSec)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata, aes(x=PRCP,y=speedMetersperSec, colour = gender)) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data=masterdata, aes(x=PRCP,y=speedMetersperSec, colour = usertype)) + geom_smooth() + facet_wrap(~ ageGroup)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
PRCP has a general positive correlation with speed, which may indicate that bikers bike faster in rainier weather. It is important to note certain fluctuations in this correlation. Perhaps the dip in speed around PRCP=1 may indicate that this amount of rain is particularly difficult to bike in, which causes bikers to slow down.
ggplot(data=masterdata[masterdata$distMeters < 10000,], aes(x=SNOW, y=distMeters, colour=usertype)) + geom_smooth() + labs(title = "Effects of Snow on Citi Bike Ride Distance", x = "Snow Depth (inches)", y = "Distance (meters)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 544 rows containing non-finite values (stat_smooth).
ggplot(data=masterdata[masterdata$distMeters < 10000,], aes(x=SNOW, y=distMeters, colour=usertype)) + geom_point(alpha = 0.5) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Ride Distance", x = "Snow Depth (inches)", y = "Distance (meters)")
## Warning: Removed 544 rows containing missing values (geom_point).
## Warning: Removed 544 rows containing missing values (geom_point).
ggplot(data=masterdata[masterdata$distMeters < 10000,], aes(x=roundedSNOW, y=distMeters, colour=usertype)) + geom_point(alpha = 0.5) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Ride Distance", x = "Snow Depth (rounded inches)", y = "Distance (meters)")
## Warning: Removed 544 rows containing missing values (geom_point).
## Warning: Removed 544 rows containing missing values (geom_point).
We can see the trend that customers generally ride a consistent distance regardless of snow, while subscribers tend to travel shorter distances when there is snow. This is likely due to the fact that subscribers or frequent users are more likely to be locals to NYC and use other forms of transportation instead (i.e. subway).
However, if we also factor in the amount of rides that are happening, we can see that subscribers make up a larger percentage of the total rides compared to customers whenever there is snow on the ground. Subscribers who are more likely to use a CitiBike than a regular customer in the snow (but will also ride a shorter distance).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=SNOW, y=tripduration, colour=usertype)) + geom_smooth() + labs(title = "Effects of Snow on Citi Bike Ride Duration", x = "Snow Depth (inches)", y = "Trip Duration (secs)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 545 rows containing non-finite values (stat_smooth).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=SNOW, y=tripduration, colour=usertype)) + geom_point(alpha = 0.5) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Ride Duration", x = "Snow Depth (inches)", y = "Trip Duration (secs)")
## Warning: Removed 545 rows containing missing values (geom_point).
## Warning: Removed 545 rows containing missing values (geom_point).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=roundedSNOW, y=tripduration, colour=usertype)) + geom_point(alpha = 0.5) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Ride Duration", x = "Snow Depth (rounded inches)", y = "Trip Duration (secs)")
## Warning: Removed 545 rows containing missing values (geom_point).
## Warning: Removed 545 rows containing missing values (geom_point).
totalRides <- tapply(masterdata$tripduration, masterdata$SNOW, mean, na.rm = TRUE)
barplot(totalRides)
totalRides <- tapply(masterdata$tripduration, masterdata$roundedSNOW, mean, na.rm = TRUE)
barplot(totalRides)
Trip duration drastically decreases when there is snow on the ground this is likely due to a combination of temperature and safety concerns. However, we see a much steeper drop off in ridership and trip duration among regular customers than subscribers.
A potential solution to increase business would be to increase incentives for non-subscribers to ride when their is snow (is pricing a concern for them?) >> risk: liability?
# Count - Raw Data
ggplot(data=masterdata, aes(x=SNOW)) + geom_bar()
## Warning: Removed 545 rows containing non-finite values (stat_count).
ggplot(data=masterdata[masterdata$SNOW > 0, ], aes(x=SNOW)) + geom_bar()
## Warning: Removed 545 rows containing non-finite values (stat_count).
# Count - Rounded Values
ggplot(data=masterdata, aes(x=roundedSNOW)) + geom_bar()
## Warning: Removed 545 rows containing non-finite values (stat_count).
ggplot(data=masterdata[masterdata$roundedSNOW > 0, ], aes(x=roundedSNOW)) + geom_bar()
## Warning: Removed 545 rows containing non-finite values (stat_count).
# Average - Rounded Values
masterdata %>%
group_by(roundedSNOW) %>%
summarise(
numRides = mean(n())
) %>%
ggplot(aes(x = roundedSNOW, y = numRides)) + geom_point() + ylim(0,1000) + labs(x = "Snow Depth", y = "Average Number of CitiBike Rides")
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Removed 2 rows containing missing values (geom_point).
As expected the more snow there is on the ground the less riders (on average per day with that amount of snow) there are. The second plots doesn’t include 0 values to remove days where there is no snow on the ground.
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=SNOW, y=tripduration, colour=gender)) + geom_smooth() + labs(title = "Effects of Snow on Citi Bike Riders based on Gender", x = "Snow Depth (inches)", y = "Trip Duration (secs)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 545 rows containing non-finite values (stat_smooth).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=SNOW, y=tripduration, colour=gender)) + geom_point(alpha = 0.25) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Riders based on Gender", x = "Snow Depth (inches)", y = "Trip Duration (secs)")
## Warning: Removed 545 rows containing missing values (geom_point).
## Warning: Removed 545 rows containing missing values (geom_point).
ggplot(data=masterdata[masterdata$tripduration < 10000,], aes(x=roundedSNOW, y=tripduration, colour=gender)) + geom_point(alpha = 0.25) + geom_jitter() + labs(title = "Effects of Snow on Citi Bike Riders based on Gender", x = "Snow Depth (rounded inches)", y = "Trip Duration (secs)")
## Warning: Removed 545 rows containing missing values (geom_point).
## Warning: Removed 545 rows containing missing values (geom_point).
It seems that males are more likely to continue riding CitiBikes when there is snow on the ground than Females or Unknowns (which is expected as Unknowns fall largely in the non-subscriber category), but the difference is minimal.
ggplot(data=masterdata, aes(x=SNOW, y=speedMetersperSec, colour=usertype)) + geom_smooth() + labs(title = "Effects of Snow on Citi Bike Rider Speed", x = "Snow Depth (inches)", y = "Speed (meters per sec)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 545 rows containing non-finite values (stat_smooth).
Subscribers are trying to get from point A to point B and actually go faster with more snow (likely to get out of the cold and we can assume NYC streets are well plowed despite indicated snow depth)
masterdata %>%
mutate(Timings = as.POSIXct(startTime)) %>%
group_by(lubridate::hour(Timings)) %>%
summarise(count=n()) %>%
arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 24 x 2
## `lubridate::hour(Timings)` count
## <int> <int>
## 1 17 20194
## 2 18 18831
## 3 8 16185
## 4 16 14545
## 5 19 13033
## 6 9 13020
## 7 15 12473
## 8 14 12190
## 9 13 11698
## 10 12 11129
## # ... with 14 more rows
hour <- format(as.POSIXct(masterdata$startTime, format="%H:%M:%S"),"%H")
hourie <- as.factor(hour)
go <- tapply(masterdata$tripduration, hourie, mean)
barplot(go,
main="Trip Duration by Month(Average)",
names.arg=c( "00", "01", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24"),
ylab="Average Duration",
col=c("red", "white", "blue", "green", "black", "yellow", "purple", "grey", "pink", "orange"),
)
hourtable <- table(hourie)
barplot(hourtable)
masterdata$hour <- hour
# Unique Departures and Arrivals for Each Citi Bike Station
StationStartsAM <- as.data.frame(table(masterAM$start.station.name))
StationEndsAM <- as.data.frame(table(masterAM$end.station.name))
StationDataAM <- data.frame(masterAM$start.station.name)
StationDataAM <- unique(StationDataAM)
StationDataAM$numStarts <- StationStartsAM$Freq[match(StationDataAM$masterAM.start.station.name, StationStartsAM$Var1)]
StationDataAM$numEnds <- StationEndsAM$Freq[match(StationDataAM$masterAM.start.station.name, StationEndsAM$Var1)]
# Compute the difference (Arrivals > Departures)
StationDataAM$difference <- StationDataAM$numEnds - StationDataAM$numStarts
StationDataAM <- arrange(StationDataAM, desc(difference))
StationDataAM <- na.omit(StationDataAM)
# Top 10 stations that gain bikes throughout the morning
TopTenSurplusAM <- head(StationDataAM, 10)
# Top 10 stations that lose bikes throughout the morning
TopTenDeficitAM <-tail(StationDataAM, 10)
TopTenDeficitAM <- arrange(TopTenDeficitAM, difference)
#Replicate for PM times
# Unique Departures and Arrivals for Each Citi Bike Station
StationStartsPM <- as.data.frame(table(masterPM$start.station.name))
StationEndsPM <- as.data.frame(table(masterPM$end.station.name))
StationDataPM <- data.frame(masterPM$start.station.name)
StationDataPM <- unique(StationDataPM)
StationDataPM$numStarts <- StationStartsPM$Freq[match(StationDataPM$masterPM.start.station.name, StationStartsPM$Var1)]
StationDataPM$numEnds <- StationEndsPM$Freq[match(StationDataPM$masterPM.start.station.name, StationEndsPM$Var1)]
# Compute the difference (Arrivals > Departures)
StationDataPM$difference <- StationDataPM$numEnds - StationDataPM$numStarts
StationDataPM <- arrange(StationDataPM, desc(difference))
StationDataPM <- na.omit(StationDataPM)
# Top 10 stations that gain bikes throughout the morning
TopTenSurplusPM <- head(StationDataPM, 10)
# Top 10 stations that lose bikes throughout the morning
TopTenDeficitPM <-tail(StationDataPM, 10)
TopTenDeficitPM <- arrange(TopTenDeficitPM, difference)
TopTenSurplusAM
## masterAM.start.station.name numStarts numEnds difference
## 1 Broadway & E 22 St 239 623 384
## 2 North Moore St & Greenwich St 108 485 377
## 3 E 47 St & Park Ave 242 582 340
## 4 W 52 St & 6 Ave 156 439 283
## 5 W 52 St & 5 Ave 77 355 278
## 6 6 Ave & Canal St 89 364 275
## 7 Grand Army Plaza & Central Park S 169 423 254
## 8 E 24 St & Park Ave S 252 472 220
## 9 E 48 St & 5 Ave 153 356 203
## 10 Broadway & Battery Pl 91 288 197
TopTenDeficitAM
## masterAM.start.station.name numStarts numEnds difference
## 1 8 Ave & W 31 St 709 260 -449
## 2 E 13 St & Avenue A 386 125 -261
## 3 E 10 St & Avenue A 353 108 -245
## 4 Christopher St & Greenwich St 449 270 -179
## 5 12 Ave & W 40 St 404 233 -171
## 6 E 6 St & Avenue B 269 98 -171
## 7 1 Ave & E 18 St 272 103 -169
## 8 E 7 St & Avenue A 311 142 -169
## 9 E 2 St & Avenue B 266 100 -166
## 10 E 20 St & FDR Drive 260 96 -164
TopTenSurplusPM
## masterPM.start.station.name numStarts numEnds difference
## 1 8 Ave & W 31 St 491 909 418
## 2 E 10 St & Avenue A 342 607 265
## 3 E 20 St & FDR Drive 262 450 188
## 4 E 2 St & Avenue B 281 465 184
## 5 E 13 St & Avenue A 464 639 175
## 6 E 6 St & Avenue B 284 454 170
## 7 1 Ave & E 16 St 494 655 161
## 8 St Marks Pl & 1 Ave 383 541 158
## 9 12 Ave & W 40 St 610 761 151
## 10 1 Ave & E 18 St 297 445 148
TopTenDeficitPM
## masterPM.start.station.name numStarts numEnds difference
## 1 North Moore St & Greenwich St 611 258 -353
## 2 Grand Army Plaza & Central Park S 636 311 -325
## 3 Broadway & E 22 St 886 571 -315
## 4 E 47 St & Park Ave 515 223 -292
## 5 W 52 St & 5 Ave 441 191 -250
## 6 W 52 St & 6 Ave 432 195 -237
## 7 E 48 St & 5 Ave 513 280 -233
## 8 E 24 St & Park Ave S 607 391 -216
## 9 6 Ave & Canal St 394 191 -203
## 10 Broadway & Battery Pl 372 210 -162
# Top Ten gains in the morning
TenSurplusMorning <- ggplot(TopTenSurplusAM, aes(reorder(masterAM.start.station.name, - difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenSurplusMorning
this chart shows a high level of asymmetry among the bike stations with station “Broadwat & E 22 St” having 384 more arrivals than departures in the morning throughout the year.
# Top Ten loses in the morning
TenDeficitMorning <- ggplot(TopTenDeficitAM, aes(reorder(masterAM.start.station.name, difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenDeficitMorning
Asymmetric traffic patterns are also evident in the chart above, with station “8 Ave & W 31 St” experiencing 449 more departures than arrivals in the morning throughout the year.
# Top Ten gains in the afternoon
TenSurplusAfternoon<- ggplot(TopTenSurplusPM, aes(reorder(masterPM.start.station.name, - difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenSurplusAfternoon
The evidence of asymmetric traffic is also present in this chart, with station " 8 Ave & W 31 St " leading with 418 more arrivals than departures. Some symmetry between the two time periods is seen here with a loss of 449 bikes at this same station in the morning time period throughout the year. In fact, 8 of the top 10 stations that lose bikes in the morning period are present here gaining bikes.
# Top Ten loses in the afternoon
TenDeficitAfternoon <- ggplot(TopTenDeficitPM, aes(reorder(masterPM.start.station.name, difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenDeficitAfternoon
In the chart for the top ten stations that experience losses all ten of the stations that experience overflow of bikes in the morning time perion are present here.
#create counts of data
# Unique Departures and Arrivals for Each Citi Bike Station for Week
StationStartsWeek <- as.data.frame(table(weekdayData$start.station.name))
StationEndsWeek <- as.data.frame(table(weekdayData$end.station.name))
StationDataWeek <- data.frame(weekdayData$start.station.name)
StationDataWeek <- unique(StationDataWeek)
StationDataWeek$numStarts <- StationStartsWeek$Freq[match(StationDataWeek$weekdayData.start.station.name, StationStartsWeek$Var1)]
StationDataWeek$numEnds <- StationEndsWeek$Freq[match(StationDataWeek$weekdayData.start.station.name, StationEndsWeek$Var1)]
# Compute the difference (Arrivals > Departures)
StationDataWeek$difference <- StationDataWeek$numEnds - StationDataWeek$numStarts
StationDataWeek <- arrange(StationDataWeek, desc(difference))
StationDataWeek <- na.omit(StationDataWeek)
# Top 10 stations that gain bikes throughout Weekdays
TopTenSurplusWeek <- head(StationDataWeek, 10)
# Top 10 stations that lose bikes throughout Weekend Mornings
TopTenDeficitWeek <-tail(StationDataWeek, 10)
TopTenDeficitWeek <- arrange(TopTenDeficitWeek, difference)
#create counts of data
# Unique Departures and Arrivals for Each Citi Bike Station for Weekend
StationStartsWeekend <- as.data.frame(table(weekendData$start.station.name))
StationEndsWeekend <- as.data.frame(table(weekendData$end.station.name))
StationDataWeekend <- data.frame(weekendData$start.station.name)
StationDataWeekend <- unique(StationDataWeekend)
StationDataWeekend$numStarts <- StationStartsWeekend$Freq[match(StationDataWeekend$weekendData.start.station.name, StationStartsWeekend$Var1)]
StationDataWeekend$numEnds <- StationEndsWeekend$Freq[match(StationDataWeekend$weekendData.start.station.name, StationEndsWeekend$Var1)]
# Compute the difference (Arrivals > Departures)
StationDataWeekend$difference <- StationDataWeekend$numEnds - StationDataWeekend$numStarts
StationDataWeekend <- arrange(StationDataWeekend, desc(difference))
StationDataWeekend <- na.omit(StationDataWeekend)
# Top 10 stations that gain bikes throughout Weekend Mornings
TopTenSurplusWeekend <- head(StationDataWeekend, 10)
# Top 10 stations that lose bikes throughout Weekend Mornings
TopTenDeficitWeekend <-tail(StationDataWeekend, 10)
TopTenDeficitWeekend <- arrange(TopTenDeficitWeekend, difference)
# Top Surplus During Weekday Mornings
TenSurplusWeek <- ggplot(TopTenSurplusWeek, aes(reorder(weekdayData.start.station.name, - difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenSurplusWeek
# Top Deficit during Week Mornings
TenDeficitWeek <- ggplot(TopTenDeficitWeek, aes(reorder(weekdayData.start.station.name, difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenDeficitWeek
# Top Surplus During Weekend Mornings
TenSurplusWeekend <- ggplot(TopTenSurplusWeekend, aes(reorder(weekendData.start.station.name, - difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenSurplusWeekend
# Top Deficit during Weekend Mornings
TenDeficitWeekend <- ggplot(TopTenDeficitWeekend, aes(reorder(weekendData.start.station.name, difference), difference)) +
geom_col() +
scale_x_discrete(label = function(x) stringr::str_trunc(x, 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
TenDeficitWeekend
Between the top deficit stations in the morning of the weekends and weekdays there is only one overlapping station: E 13th and Avenue A., further the highest deficit on weekends is around 40 (average 20 a day [40/2]) whereas during the week the highest is approximately 450(average 90 a day [450/5]). Therefore this analysis shows that there is much more asymmetry during the week as opposed to the weekend, when less people commute for work. This pattern holds true for top surplus stations in the mornings of the weekend and weekdays with no overlapping stations,and the top station during weekdays with a surplus of 375 (average 75 a day [375/5])compared to a top surplus of 55 on the weekend (average 28 a day [55/2]). In summary, this supports the hypothesis that asymmetry is mainly caused by commutes to work during the week.
CountCustomerBroadway <- nrow(masterAM[masterAM$usertype == "Customer" & masterAM$start.station.name == "Broadway & E 22 St",])
CountSubscriberBroadway <- nrow(masterAM[masterAM$usertype == "Subscriber" & masterAM$start.station.name == "Broadway & E 22 St",])
CountCustomerNMoore <- nrow(masterPM[masterPM$usertype == "Customer" & masterPM$start.station.name == "North Moore St & Greenwich St",])
CountSubscriberNMoore <- nrow(masterPM[masterPM$usertype == "Subscriber" & masterPM$start.station.name == "North Moore St & Greenwich St",])
For stations that are asymmetric, users tend to be “subscribers” as opposed to “customers”. At the station “Broadway & E 22 St” of the 239 users who started trips there 230 are subscribers while only 9. For another station “North Moore St & Greenwich” there was only 61customers while there was 550 Subscribers.
# Add longitude and latitude to the dataset
startlatitude <- c(40.7403432,40.72019521,40.75510267, 40.76132983,40.75992262,40.72243797,40.7643971,40.74096374, 40.75724568, 40.70463334)
TopTenSurplusAM$startlatitude <- startlatitude
startlongitude <- c(-73.98955109,-74.01030064,-73.97498696
,-73.97982001, -73.97648516, -74.00566443
, -73.97371465, -73.98602213, -73.97805914
, -74.01361706)
TopTenSurplusAM$startlongitude <- startlongitude
#Map TopTenSurplus AM
register_google(key = "AIzaSyDr6TG5wIRo6iXXvRbE0rV3n2EPx1jApRc")
## get station info
station.info <- TopTenSurplusAM %>%
group_by(masterAM.start.station.name) %>%
summarise(lat=as.numeric(startlatitude),
long=as.numeric(startlongitude),
difference = difference)
## `summarise()` ungrouping output (override with `.groups` argument)
## get map and plot station locations
newyork.map <- get_map(location= 'Lower Manhattan, New York',
maptype='roadmap', color='bw',source='google',zoom=12)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Lower%20Manhattan,%20New%20York&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Lower+Manhattan,+New+York&key=xxx
ggmap(newyork.map) +
geom_point(data=station.info,aes(x=long,y=lat,color= difference),size=5,alpha=0.75)+
scale_colour_gradient(high="red",low='green')+
theme(axis.ticks = element_blank(),axis.text = element_blank())+
xlab('')+ylab('')
As seen in the geographic groupings of the top asymmetric stations most of them are located in Lower Manhattan, specifically in areas like Midtown where there are many jobs which is true for pretty much all of these areas. Although this map only shows the top stations that gain bikes in the morning, this pattern is true for the asymmetric stations due to the overlap from the relationship between the stations.
This is the link to our dashboard. It displays an error for approximately two seconds while loading the large dataset, but then it will work as desired. https://namangupta0.shinyapps.io/to404team4/